This markdown uses the spsurvey package developed by Michael Dumelle, Tom Kincaid, Tony Olsen, and Marc Weber at EPA to analyze the results of New York’s spatially-balanced probability-based survey in 2020-2021. The results include categorical condition extent estimates for statewide conditions, cumulative distribution functions for water quality parameters, and comparisons to the National Lakes Assessment (nationally and regionally), New Hampshire and to New York’s historical data (LMAS samples since 2010).
start.time <- Sys.time()
library(tidyverse)
library(huxtable)
library(ggplot2)
library(lubridate)
library(egg)
library(gridExtra)
library(ggpattern)
library(ggmap)
library(maps)
library(spsurvey) #This code was written for spsurvey v5.0.0
To run cat_analysis(), the dataframe will need to include columns for the categorical variable you are interested in, weight for each site, xcoordinate and ycoordinate. Optionally, you can also include a subpopulation or stratum column to separate results and site IDs for two-stage analysis (for example, if there are repeated sites).
In this script, weight is adjusted because of the proportional design. In the future, this can be excluded as the design should include weights already.
I am designating relevant variables from the dataset as well as creating new variables: mean dissolved oxygen from 0-2m, epilimentic means for ph and spconductance, total nitrogen and DIN:TP ratio.
# retrieve raw data from database
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Current")
source("new_database/Reading.LMAS.Data.R")
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")
rm(list=setdiff(ls(), c('newdata',"start.time")))
#pick out relevant variables
lab<-newdata %>%
filter(SAMPLE_DATE>'2020-01-01') %>%
mutate(combined=paste(CHARACTERISTIC_NAME,
INFORMATION_TYPE,
RSLT_RESULT_SAMPLE_FRACTION,
sep = "_")) %>% #combining parameters into 1 name
select(LAKE_HISTORY_ID,
SAMPLE_DATE,
combined,
RSLT_RESULT_VALUE,
RSLT_LABORATORY_QUALIFIER,
RSLT_VALIDATOR_QUALIFIER,
RSLT_PROFILE_DEPTH) %>%
mutate(RSLT_RESULT_VALUE=ifelse(!is.na(RSLT_LABORATORY_QUALIFIER)&(RSLT_LABORATORY_QUALIFIER=="U"|RSLT_LABORATORY_QUALIFIER=="UE"),"0",RSLT_RESULT_VALUE),
RSLT_RESULT_VALUE=as.numeric(RSLT_RESULT_VALUE)) %>%
filter(!is.na(RSLT_RESULT_VALUE),
is.na(RSLT_VALIDATOR_QUALIFIER)|(RSLT_VALIDATOR_QUALIFIER!="R"), #removing rejected samples
combined %in% c('CHLOROPHYLL A_OW_TOTAL', #picking parameters
'PHOSPHORUS, TOTAL_OW_TOTAL',
"MICROCYSTIN_OW_NA",
"NITROGEN, NITRATE (AS N)_OW_TOTAL",
"NITROGEN, NITRATE-NITRITE_OW_TOTAL",
"NITROGEN, KJELDAHL, TOTAL_OW_TOTAL",
"NITROGEN, TOTAL_OW_TOTAL",
"DISSOLVED OXYGEN_DP_NA",
"TRUE COLOR_OW_TOTAL",
"PH_DP_NA",
"SPECIFIC CONDUCTANCE_DP_NA",
"TEMPERATURE_DP_NA",
"DEPTH, SECCHI DISK DEPTH_SD_NA",
"CALCIUM_OW_TOTAL",
"CHLORIDE_OW_TOTAL",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA",
"ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL",
"ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL",
"CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED",
"NITROGEN, AMMONIA (AS N)_OW_TOTAL")) %>%
select(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_RESULT_VALUE,RSLT_PROFILE_DEPTH) %>%
distinct(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_PROFILE_DEPTH,.keep_all = TRUE) %>%
rename(LAKE_ID=LAKE_HISTORY_ID,
chemical_name=combined,
result_value=RSLT_RESULT_VALUE,
profile_depth=RSLT_PROFILE_DEPTH)
# dissolved oxygen:
# keep DO values between 0 and 2m depth
lab2 <- lab %>%
mutate(keep=case_when(
!(chemical_name=="DISSOLVED OXYGEN_DP_NA") ~ "no",
chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth <= 2 ~ "yes",
chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth > 2 ~ "no")) %>%
filter(keep!="no") %>%
select(-keep)
# mean DO for each lake each date
DO <- aggregate(lab2$result_value,list(lab2$LAKE_ID,lab2$SAMPLE_DATE),FUN=mean) %>%
mutate(chemical_name="DISSOLVED OXYGEN_epi") %>%
rename(LAKE_ID=Group.1,
SAMPLE_DATE=Group.2,
result_value=x)
#thermocline
thermocline<-lab %>%
filter(chemical_name=='TEMPERATURE_DP_NA') %>%
mutate(thermocline=NA)
LID<-thermocline$LAKE_ID[1]
date<-thermocline$SAMPLE_DATE[1]
depth<-thermocline$profile_depth[1]
temp<-thermocline$result_value[1]
for(i in seq(nrow(thermocline))){
current<-thermocline[i,]
if(current$LAKE_ID==LID¤t$SAMPLE_DATE==date){
depth=current$profile_depth
if((temp-current$result_value)>1){
thermocline$thermocline[i]<-current$profile_depth
}
temp=current$result_value
}
else{
LID=current$LAKE_ID
date=current$SAMPLE_DATE
depth=current$profile_depth
temp=current$result_value
}
}
thermocline<-thermocline %>% #pull the lowest value for all
group_by(LAKE_ID,SAMPLE_DATE) %>%
mutate(thermocline=min(thermocline, na.rm=T)) %>%
ungroup() %>%
mutate(thermocline=ifelse(thermocline==Inf,NA,thermocline)) %>%
select(LAKE_ID,SAMPLE_DATE,thermocline,profile_depth) %>%
filter(!is.na(thermocline)) %>% #remove those without a thermocline
distinct()
#epilimentic means for ph and spconductance
epi<-lab %>%
inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>%
filter(chemical_name %in% c("PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA")) %>%
mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>% #convert ph to [H+]
distinct() %>%
arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth)
epi<-epi %>%
filter(profile_depth<=thermocline) %>% #shallower than thermocline
distinct() %>%
group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>%
summarize(Mean=mean(result_value,na.rm=TRUE), #creating epilimnetic mean
n=n()) %>%
filter(n>2) %>%
select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)
epi<-epi %>%
mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>% #[H+] back to pH
rename(result_value=Mean) %>%
mutate(chemical_name=case_when(
chemical_name=="PH_DP_NA"~"PH_epi",
chemical_name=="SPECIFIC CONDUCTANCE_DP_NA"~"SPECIFIC CONDUCTANCE_epi"
))
#hypolimnetic means for ph and dissolved oxygen, as above
hypo<-lab %>%
inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>%
filter(chemical_name %in% c("PH_DP_NA","DISSOLVED OXYGEN_DP_NA")) %>%
mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>%
distinct() %>%
arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth)
hypo<-hypo %>%
filter(profile_depth>=thermocline) %>% #deeper than thermocline
distinct() %>%
group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>%
summarize(Mean=mean(result_value,na.rm=TRUE),
n=n()) %>%
filter(n>2) %>%
select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)
hypo<-hypo %>%
mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>%
rename(result_value=Mean) %>%
mutate(chemical_name=case_when(
chemical_name=="PH_DP_NA"~"PH_hypo",
chemical_name=="DISSOLVED OXYGEN_DP_NA"~"DISSOLVED_OXYGEN_mg_L"
))
# read in site data
sites<-read.csv("Probability_Based_Sites_2020_2021.csv")
sites<-sites %>%
rename(siteID=SITE_ID,
xcoord=LON_DD83,
ycoord=LAT_DD83,
Eval_Status=EvalStatus) %>%
filter(Eval_Status!="") %>% #removing sites that we haven't yet evaluated
mutate(Eval_Status=trimws(Eval_Status))
# restrict to only the data in the probability study and spread the data
att<-merge(lab,epi,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>% #merging in epi means for pH and spcond
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,hypo,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>% #merging in hypo means for pH and DO
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,DO,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE)%>% #merging in epi DO (epa method)
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
filter(!chemical_name %in% c("DISSOLVED OXYGEN_DP_NA","PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA","TEMPERATURE_DP_NA")) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,sites,by=c('LAKE_ID'),all.y = TRUE) %>% distinct() #filtering by site list
att<-att %>% spread(chemical_name,result_value,fill = NA)
att<-att %>% filter((Eval_Status=="Target_Sampled"&!is.na(`CHLOROPHYLL A_OW_TOTAL`)&!is.na(`PHOSPHORUS, TOTAL_OW_TOTAL`)) | #keep samples with valid results in both chl and tp if sampled
(Eval_Status!="Target_Sampled") | #keep unsampled
(LAKE_ID %in% c("0703UWBXXX1","0801KAY0984A","1203MET0821","0602LUD0099","0801GUL0969"))) %>% #special lakes that would get otherwise deleted because they do not have valid data but were sampled and only have 1 sampling event
rename("CHLOROPHYLL_ug_L"=`CHLOROPHYLL A_OW_TOTAL`, #rename without spaces for cont_analysis later
"PHOSPHORUS_mg_L"=`PHOSPHORUS, TOTAL_OW_TOTAL`,
"ALKALINITY_mg_L"=`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,
"CHLORIDE_mg_L"=CHLORIDE_OW_TOTAL,
"CALCIUM_mg_L"=CALCIUM_OW_TOTAL,
"DOC_mg_L"=`CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED`,
"SECCHI_m"=`DEPTH, SECCHI DISK DEPTH_SD_NA`,
"TRUE_COLOR"=`TRUE COLOR_OW_TOTAL`,
"SPEC_CONDUCTANCE"=`SPECIFIC CONDUCTANCE_epi`,
"AMMONIA_N"=`NITROGEN, AMMONIA (AS N)_OW_TOTAL`)
#creating thresholds
att<-att %>%
# remove fields in the sites table that aren't relevant
select(-Accessible,-Comments,-Contact,-STATUS) %>%
# create total nitrogen
mutate(NITROGEN_mg_L=case_when(
!is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~
(`NITROGEN, NITRATE-NITRITE_OW_TOTAL`+`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`),
is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~ `NITROGEN, TOTAL_OW_TOTAL`)) %>%
# DIN:TP
mutate(NP_ratio=`NITROGEN, NITRATE (AS N)_OW_TOTAL`/PHOSPHORUS_mg_L) %>%
# trophic status
mutate(phos_trophic=case_when(
PHOSPHORUS_mg_L<=0.01 ~ "Oligotrophic",
between(PHOSPHORUS_mg_L,0.01,0.02) ~ "Mesotrophic",
PHOSPHORUS_mg_L>=0.02 ~ "Eutrophic")) %>%
mutate(chla_trophic=case_when(
CHLOROPHYLL_ug_L<=2 ~ "Oligotrophic",
between(CHLOROPHYLL_ug_L,2,8) ~ "Mesotrophic",
CHLOROPHYLL_ug_L>=8 ~ "Eutrophic")) %>%
# EPA thresholds
mutate(TP_threshold=case_when(
PHOSPHORUS_mg_L<=0.016 ~ "Good",
between(PHOSPHORUS_mg_L, 0.016, 0.0279) ~ "Fair",
PHOSPHORUS_mg_L>=0.0279 ~ "Poor")) %>%
mutate(TN_threshold=case_when(
NITROGEN_mg_L<=0.428 ~ "Good",
between(NITROGEN_mg_L, 0.428, 0.655) ~ "Fair",
NITROGEN_mg_L>=0.655 ~ "Poor")) %>%
mutate(CHLA_threshold=case_when(
CHLOROPHYLL_ug_L<=4.52 ~ "Good",
between(CHLOROPHYLL_ug_L, 4.52, 8.43) ~ "Fair",
CHLOROPHYLL_ug_L>=8.43 ~ "Poor")) %>%
# microcystin
mutate(microcystin=case_when(
is.na(`MICROCYSTIN_OW_NA`) ~"Non-detect",
`MICROCYSTIN_OW_NA`<8 ~ "Microcystin Detected",
`MICROCYSTIN_OW_NA`>=8 ~ "Most disturbed")) %>%
# dissolved oxygen
mutate(d.oxygen=case_when(
`DISSOLVED OXYGEN_epi`<=3 ~ "Poor",
between(`DISSOLVED OXYGEN_epi`, 3, 5) ~ "Fair",
`DISSOLVED OXYGEN_epi`>=5 ~ "Good")) %>%
#nutrient limitation
mutate(N_LIMIT=case_when(
NITROGEN_mg_L/PHOSPHORUS_mg_L<=10 ~ "N-limited",
between(NITROGEN_mg_L/PHOSPHORUS_mg_L, 10, 20) ~ "Co-limited",
NITROGEN_mg_L/PHOSPHORUS_mg_L>=20 ~ "P-limited")) %>%
#secchi
mutate(secchi=case_when(
SECCHI_m<=2 ~ "Eutrophic",
between(SECCHI_m, 2, 5) ~ "Mesotrophic",
SECCHI_m>=5 ~ "Oligotrophic")) %>%
#zebra mussels
mutate(zebra=case_when(
CALCIUM_mg_L<=10 ~ "Not susceptible",
between(CALCIUM_mg_L, 10, 20) ~ "May be susceptible",
CALCIUM_mg_L>=20 ~ "Highly susceptible")) %>%
#cslap
mutate(conductance=case_when(
SPEC_CONDUCTANCE<=125 ~ "Soft",
between(SPEC_CONDUCTANCE, 125, 250) ~ "Average",
SPEC_CONDUCTANCE>=250 ~ "Hard")) %>%
mutate(color=case_when(
TRUE_COLOR<=10 ~ "Uncolored",
between(TRUE_COLOR, 10, 30) ~ "Weak",
TRUE_COLOR>=30 ~ "High")) %>%
mutate(ph=case_when(
`PH_epi`<=6.5 ~ "Acidic",
between(`PH_epi`, 6.5, 7.5) ~ "~Neutral",
between(`PH_epi`, 7.5, 8.5) ~ "Slightly alk",
`PH_epi`>=8.5 ~ "Highly alk")) %>%
#leech
mutate(leech=case_when(
PHOSPHORUS_mg_L<=0.030 & TRUE_COLOR<=20 ~ "Blue",
PHOSPHORUS_mg_L>0.030 & TRUE_COLOR<=20 ~ "Green",
PHOSPHORUS_mg_L<=0.030 & TRUE_COLOR>20 ~ "Brown",
PHOSPHORUS_mg_L>0.030 & TRUE_COLOR>20 ~ "Murky"
)) %>%
#chloride
mutate(chloride=case_when(
`CHLORIDE_mg_L` <= 35 ~ "Low",
between(`CHLORIDE_mg_L`,35,250) ~ "Medium",
`CHLORIDE_mg_L` >= 250 ~ "High",
Eval_Status=="Target_Sampled" & is.na(`CHLORIDE_mg_L`) ~ "Low")) %>%
#algal composition
rename(CHLOROPHYTE=`CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA`,
CRYPTOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA`,
CYANOBACTERIA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA`,
DINOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA`,
PROBE_TOTAL=`CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA`) %>%
mutate(chlorophyte=case_when(
CHLOROPHYTE/PROBE_TOTAL<=0.25 ~ "Low",
between(CHLOROPHYTE/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CHLOROPHYTE/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(cryptophyta=case_when(
CRYPTOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
between(CRYPTOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CRYPTOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(cyanobacteria=case_when(
CYANOBACTERIA/PROBE_TOTAL<=0.25 ~ "Low",
between(CYANOBACTERIA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CYANOBACTERIA/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(dinophyta=case_when(
DINOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
between(DINOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
DINOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>%
#alkalinity
mutate(alkalinity=case_when(
ALKALINITY_mg_L <= 60 ~ "Soft",
between(ALKALINITY_mg_L,60,120) ~ "Moderately hard",
between(ALKALINITY_mg_L,120,180) ~ "Hard",
ALKALINITY_mg_L >= 180 ~ "Very hard"))
# Create a Target/NonTarget status variable
att<-att %>%
mutate(statusTNT="Target",
statusTNT=ifelse(Eval_Status=="NonTarget","NonTarget",statusTNT))
# remove duplicates (multiple samples of same lake)
att <- att %>%
mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
group_by(siteID) %>%
mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
filter(has_aug & month == 8 |
!has_aug) %>% # filter only august dates from sites with or all from those without
slice_max(n = 1,
order_by = SAMPLE_DATE,
with_ties = F) %>% # change with_ties to TRUE to discard NA values
select(-has_aug)
att<-att %>% #creating adjusted weights
ungroup() %>%
mutate(WgtAdj=case_when(
PROB_CAT=="(1,4]"~(1828/29),
PROB_CAT=="(4,10]"~(2490/15),
PROB_CAT=="(10,20]"~(1003/18),
PROB_CAT=="(20,50]"~(616/20),
PROB_CAT==">50"~(500/33)))
#read in data on excursions
excursions <- read.csv("probability.data.excursions.csv")
excursions <- excursions %>%
select(parameter,n_exceedances,LAKE_ID) %>%
spread(parameter,n_exceedances,fill = 0) %>%
mutate(LAKE_ID=toupper(LAKE_ID)) %>%
mutate(all_params=case_when(
ammonia==0 & chlorophyll_a==0 & dissolved_oxygen==0 & nitrite==0 & ph==0 & phosphorus==0 ~0,
LAKE_ID=TRUE~1 #create fully supporting variable
)) %>%
rename(AMMONIA_excursions=ammonia,
DO_excursions=dissolved_oxygen,
NITRITE_excursions=nitrite,
PH_excursions=ph,
PHOSPHORUS_excursions=phosphorus,
Fully_support=all_params)
att<-merge(att,excursions,by="LAKE_ID") #merging in fishing use excursions
Additionally, I will load in NLA, NAP and NH data:
# read in NLA data
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")
chem <- read.csv("nla_2017_water_chemistry_chla-data.csv") #read in chemistry data
chem <- chem %>%
filter(ANALYTE %in% c("NTL","CHLA","PTL","NITRATE_N","COLOR","CHLORIDE","MAGNESIUM","DOC","COND","AMMONIA_N","CALCIUM","PH"),
NARS_FLAG == "") %>%
select(UID,SITE_ID,DATE_COL,ANALYTE,RESULT) %>%
pivot_wider(names_from= ANALYTE ,values_from = RESULT,) %>%
rename(TN=NTL,TP=PTL)
secchi <- read.csv("nla_2017_secchi-data.csv") %>% #read in secchi data
mutate(SECCHI=(DISAPPEARS+REAPPEARS)/2, #average the two depths
DATE_COL=mdy(DATE_COL)) %>%
select(UID,SITE_ID,DATE_COL,SECCHI)
oxygen <- read.csv("nla_2017_profile-data.csv") #read in dissolved oxygen data
oxygen <- oxygen %>%
select(UID,SITE_ID,DATE_COL,DEPTH,OXYGEN) %>%
# dissolved oxygen:
# keep DO values between 0 and 2m depth
mutate(keep=case_when(
DEPTH <= 2 ~ "yes",
DEPTH > 2 ~ "no")) %>%
filter(keep=="yes") %>%
select(-keep)
# mean DO for each lake of top 2m
DO <- aggregate(oxygen$OXYGEN,list(oxygen$SITE_ID,oxygen$DATE_COL),FUN=mean)
# add back in
oxygen <- merge(oxygen,DO,by.x=c("SITE_ID","DATE_COL"),by.y=c("Group.1","Group.2"),all.x=TRUE)
oxygen$x <- as.numeric(oxygen$x)
oxygen <- oxygen %>%
select(-c(DEPTH,OXYGEN)) %>%
rename(OXYGEN=x) %>%
distinct()
att.nla <- merge(chem,oxygen,by=c("UID","SITE_ID","DATE_COL"),all.y=TRUE,all.x=TRUE) %>% #merging chemistry and oxygen data
mutate(DATE_COL=dmy(DATE_COL))
att.nla <- merge(att.nla,secchi,by=c("UID","SITE_ID","DATE_COL"),all.x=TRUE) #merging in secchi data
att.nla[,4:17] <- lapply(att.nla[,4:17],as.numeric) #convert all columns to numeric
att.nla$NP_ratio <- as.numeric(att.nla$NITRATE_N)/(as.numeric(att.nla$TP)/1000) #create n:p ratio
att.nla <- att.nla %>% #rename to match above
rename(CHLOROPHYLL_ug_L=CHLA,
NITROGEN_mg_L=TN,
PHOSPHORUS_ug_L=TP,
CHLORIDE_mg_L=CHLORIDE,
DOC_mg_L=DOC,
CALCIUM_mg_L=CALCIUM,
TRUE_COLOR=COLOR,
SPEC_CONDUCTANCE=COND,
MAGNESIUM_OW_TOTAL=MAGNESIUM,
SECCHI_m=SECCHI,
NP_ratio=NP_ratio)
# selecting parameters and adding trophic status
att.nla<-att.nla %>%
# trophic status
mutate(phos_trophic=case_when(
PHOSPHORUS_ug_L<=10 ~ "oligotrophic",
between(PHOSPHORUS_ug_L,10,20) ~ "mesotrophic",
PHOSPHORUS_ug_L>=20 ~ "eutrophic")) %>%
mutate(chla_trophic=case_when(
CHLOROPHYLL_ug_L<=2 ~ "oligotrophic",
between(CHLOROPHYLL_ug_L,2,8) ~ "mesotrophic",
CHLOROPHYLL_ug_L>=8 ~ "eutrophic")) %>%
# EPA thresholds
mutate(TP_threshold=case_when(
PHOSPHORUS_ug_L<=16 ~ "Good",
between(PHOSPHORUS_ug_L, 16, 27.9) ~ "Fair",
PHOSPHORUS_ug_L>=27.9 ~ "Poor")) %>%
mutate(TN_threshold=case_when(
NITROGEN_mg_L<=0.428 ~ "Good",
between(NITROGEN_mg_L, 0.428, 0.655) ~ "Fair",
NITROGEN_mg_L>=0.655 ~ "Poor")) %>%
mutate(CHLA_threshold=case_when(
CHLOROPHYLL_ug_L<=4.52 ~ "Good",
between(CHLOROPHYLL_ug_L, 4.52, 8.43) ~ "Fair",
CHLOROPHYLL_ug_L>=8.43 ~ "Poor")) %>%
# dissolved oxygen
mutate(d.oxygen=case_when(
OXYGEN<=3 ~ "Poor",
between(OXYGEN, 3, 5) ~ "Fair",
OXYGEN>=5 ~ "Good")) %>%
#leech
mutate(leech=case_when(
PHOSPHORUS_ug_L<=30 & TRUE_COLOR<=20 ~ "Blue",
PHOSPHORUS_ug_L>30 & TRUE_COLOR<=20 ~ "Green",
PHOSPHORUS_ug_L<=30 & TRUE_COLOR>20 ~ "Brown",
PHOSPHORUS_ug_L>30 & TRUE_COLOR>20 ~ "Murky"
)) %>%
#chloride
mutate(chloride=case_when(
CHLORIDE_mg_L <= 35 ~ "Low",
between(CHLORIDE_mg_L,35,250) ~ "Medium",
CHLORIDE_mg_L >= 250 ~ "High"))
sites <- read.csv("nla_2017_site_information-data.csv") #read in site info for filtering
nap.sites <- sites %>%
mutate(include=case_when(
AG_ECO9=="NAP" & AREA_HA>2.63 ~ "yes", #creating subcategory to keep sites in northern appalachian ecoregion with area >6.4 acres
SITE_ID=TRUE ~ "no"
)) %>%
select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>%
filter(WGT_TP_EXTENT != 0)
nla.sites <- sites %>%
mutate(include=case_when(
AREA_HA>2.63 ~ "yes", #creating subcategory to keep sites with area >6.4 acres
SITE_ID=TRUE ~ "no"
)) %>%
select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>%
filter(WGT_TP_EXTENT != 0)
nla.att <- merge(att.nla,nla.sites,by=c("UID","SITE_ID"))
nap.att <- merge(att.nla,nap.sites,by=c("UID","SITE_ID"))
nh.att <- read.csv("New_Hampshire_Lakes_2017_Design_Status_20211027_KH_EPA.csv", na.strings=c(""," ","NA")) #read in site data
nh.att<-nh.att %>%
mutate(include=case_when(
AREA_HA>2.63 ~ "yes", #create subcategory for keeping lakes larger than 6.4 acres
SITE_ID=TRUE ~ "no"))
chem <- read.csv("BasicNHChem_ForNY.csv") #read in chem data
#merge by site
nh.att <- merge(nh.att,chem,by.x="SITE_ID",by.y="NLA.ID",all.x=T) %>%
mutate(CHLORIDE=case_when(
CHLORIDE!="<3" ~ as.numeric(CHLORIDE), #method detection limit for chloride is 3, convert smaller values to 0
CHLORIDE=="<3" ~ 0)) %>%
mutate(CHLORIDE_threshold=case_when(
CHLORIDE<=35 ~ "Low",
between(CHLORIDE,35,250) ~ "Medium",
CHLORIDE>=250 ~ "High"
)) %>%
mutate(TN_UG.L=TN_UG.L/1000) %>%
rename(CHLOROPHYLL_ug_L=CHLOROPHYLL.A.x,
NITROGEN_mg_L=TN_UG.L,
PHOSPHORUS_ug_L=PHOSPHORUS.ug.L.x,
CHLORIDE_mg_L=CHLORIDE)
Here I perform the analysis using cat_analysis(). The relevant categorical variable / threshold should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe.
In the future, the weight argument should be part of the site design and might not be called WgtAdj.
#list variables you are interested in and defined above
vars <- c("phos_trophic", "chla_trophic", "TP_threshold", "TN_threshold", "CHLA_threshold", "microcystin", "d.oxygen", "N_LIMIT", "secchi", "zebra", "conductance", "color", "ph", "leech", "chloride","alkalinity","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
#analysis
CatExtent <- cat_analysis(
dframe=att,
vars=vars,
subpops = ,
siteID = "siteID",
weight = "WgtAdj", #name will probably change in future
xcoord = "xcoord",
ycoord = "ycoord")
table <- CatExtent %>%
select(Indicator,
Category,
Estimate.P,
LCB95Pct.P,
UCB95Pct.P) %>%
filter(Category!="Total") %>% #remove totals
mutate(Category=factor(Category, levels=c("Poor","Fair","Good", #factor for plotting
"Blue","Green","Brown","Murky",
"High","Weak","Uncolored",
"Medium","Low",
"Acidic","~Neutral","Slightly alk","Highly alk",
"Soft","Moderately hard","Average","Hard","Very hard",
"Highly susceptible","May be susceptible","Not susceptible",
"Eutrophic","Mesotrophic","Oligotrophic",
"N-limited","Co-limited","P-limited",
"Non-detect","Microcystin detected","Most disturbed",
"Excursion","Non-excursion")),
Indicator=case_when( #rename for plotting
Indicator=="phos_trophic"~"Phosphorus",
Indicator=="chla_trophic"~"Chlorophyll-a",
Indicator=="TP_threshold"~"Total phosphorus",
Indicator=="TN_threshold"~"Total nitrogen",
Indicator=="CHLA_threshold"~"Total chlorophyll",
Indicator=="microcystin"~"Microcystin",
Indicator=="d.oxygen"~"Dissolved oxygen",
Indicator=="N_LIMIT"~"Nutrient limitation",
Indicator=="secchi"~"Secchi",
Indicator=="zebra"~"Zebra mussel susceptibility",
Indicator=="conductance"~"Hardness",
Indicator=="color"~"Color",
Indicator=="ph"~"pH",
Indicator=="leech"~"Nutrient-color status",
Indicator=="chloride"~"Chloride",
Indicator=TRUE~Indicator)
)
hux <- as_hux(table) #I use huxtable because kable doesn't work on my computer but that is also fine
number_format(hux) <- 2
theme_plain(hux)
| Indicator | Category | Estimate.P | LCB95.00Pct.P | UCB95.00Pct.P |
|---|---|---|---|---|
| Phosphorus | Eutrophic | 22.35 | 11.25 | 33.45 |
| Phosphorus | Mesotrophic | 34.33 | 18.35 | 50.31 |
| Phosphorus | Oligotrophic | 43.32 | 28.92 | 57.72 |
| Chlorophyll-a | Eutrophic | 11.43 | 4.97 | 17.89 |
| Chlorophyll-a | Mesotrophic | 69.33 | 56.12 | 82.53 |
| Chlorophyll-a | Oligotrophic | 19.24 | 7.08 | 31.41 |
| Total phosphorus | Fair | 18.31 | 6.53 | 30.09 |
| Total phosphorus | Good | 64.89 | 51.85 | 77.92 |
| Total phosphorus | Poor | 16.81 | 6.06 | 27.56 |
| Total nitrogen | Fair | 14.26 | 5.66 | 22.86 |
| Total nitrogen | Good | 66.79 | 52.56 | 81.02 |
| Total nitrogen | Poor | 18.95 | 5.88 | 32.02 |
| Total chlorophyll | Fair | 34.09 | 17.76 | 50.41 |
| Total chlorophyll | Good | 54.49 | 38.33 | 70.64 |
| Total chlorophyll | Poor | 11.43 | 4.97 | 17.89 |
| Microcystin | Non-detect | 100.00 | 100.00 | 100.00 |
| Dissolved oxygen | Fair | 1.28 | 0.00 | 3.46 |
| Dissolved oxygen | Good | 91.16 | 80.11 | 100.00 |
| Dissolved oxygen | Poor | 7.55 | 0.00 | 18.47 |
| Nutrient limitation | Co-limited | 17.32 | 3.51 | 31.12 |
| Nutrient limitation | N-limited | 0.78 | 0.00 | 2.13 |
| Nutrient limitation | P-limited | 81.90 | 68.01 | 95.80 |
| Secchi | Eutrophic | 46.37 | 31.37 | 61.37 |
| Secchi | Mesotrophic | 48.69 | 33.50 | 63.88 |
| Secchi | Oligotrophic | 4.94 | 0.80 | 9.09 |
| Zebra mussel susceptibility | Highly susceptible | 29.72 | 12.54 | 46.90 |
| Zebra mussel susceptibility | May be susceptible | 14.87 | 0.00 | 31.71 |
| Zebra mussel susceptibility | Not susceptible | 55.41 | 34.80 | 76.02 |
| Hardness | Average | 16.41 | 2.38 | 30.45 |
| Hardness | Hard | 23.19 | 7.91 | 38.48 |
| Hardness | Soft | 60.39 | 44.37 | 76.42 |
| Color | High | 68.64 | 52.20 | 85.07 |
| Color | Uncolored | 8.28 | 0.14 | 16.41 |
| Color | Weak | 23.09 | 10.25 | 35.92 |
| pH | ~Neutral | 47.34 | 30.03 | 64.65 |
| pH | Acidic | 15.26 | 4.42 | 26.10 |
| pH | Highly alk | 15.28 | 1.49 | 29.08 |
| pH | Slightly alk | 22.12 | 10.28 | 33.95 |
| Nutrient-color status | Blue | 21.59 | 7.15 | 36.04 |
| Nutrient-color status | Brown | 54.50 | 34.94 | 74.06 |
| Nutrient-color status | Green | 1.08 | 0.00 | 3.07 |
| Nutrient-color status | Murky | 22.83 | 6.76 | 38.90 |
| Chloride | High | 6.60 | 0.00 | 16.90 |
| Chloride | Low | 77.19 | 64.37 | 90.01 |
| Chloride | Medium | 16.21 | 5.60 | 26.81 |
| alkalinity | Hard | 11.47 | 0.18 | 22.75 |
| alkalinity | Moderately hard | 20.63 | 7.24 | 34.03 |
| alkalinity | Soft | 67.90 | 54.43 | 81.37 |
| chlorophyte | High | 75.45 | 58.60 | 92.30 |
| chlorophyte | Low | 13.42 | 0.00 | 28.43 |
| chlorophyte | Medium | 11.13 | 1.60 | 20.66 |
| cryptophyta | Low | 100.00 | 100.00 | 100.00 |
| cyanobacteria | High | 16.19 | 0.32 | 32.07 |
| cyanobacteria | Low | 65.93 | 47.82 | 84.03 |
| cyanobacteria | Medium | 17.88 | 6.75 | 29.00 |
| dinophyta | High | 2.11 | 0.00 | 5.90 |
| dinophyta | Low | 87.84 | 78.38 | 97.29 |
| dinophyta | Medium | 10.06 | 1.51 | 18.61 |
ny.cat <- table %>% filter(Indicator %in% c("Total phosphorus","Total nitrogen","Total chlorophyll","Dissolved oxygen","Chloride")) %>% mutate(Study="NY") #prep for NLA/NH/NAP comparison plots later
probability.cat <- table %>% filter(Indicator %in% c("Phosphorus","Chlorophyll-a","Secchi")) #prep for historical bias plot later
Here I perform the analysis using cont_analysis(). The relevant CONTINUOUS variable should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe with all. The output of this function is a list with 3 estimations: cumulative distribution function (CDF), percentiles, and means.
As above, in the future the weight argument should be part of the site design and might not be called WgtAdj.
#To conduct analysis on a continuous variable, using a new list of CONTINUOUS variables, also note that cont_analysis() doesn't like variable names that have spaces so you will need to rename these. It's helpful to put the units in the name.
myvars <- c("CHLOROPHYLL_ug_L","NITROGEN_mg_L","PHOSPHORUS_mg_L","CHLORIDE_mg_L","DISSOLVED_OXYGEN_mg_L","DOC_mg_L","CALCIUM_mg_L","SECCHI_m","TRUE_COLOR","SPEC_CONDUCTANCE","AMMONIA_N","MAGNESIUM_OW_TOTAL","PH_hypo","ARSENIC_BS_TOTAL", "ARSENIC_OW_TOTAL", "IRON_BS_TOTAL", "IRON_OW_TOTAL", "MAGNESIUM_BS_TOTAL", "MAGNESIUM_OW_TOTAL","ALKALINITY_mg_L","NP_ratio")
#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
dframe = att,
vars = myvars,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
NY<-analysis$CDF %>% #prep for comparison with NLA/NAP/NH plots later
select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>%
mutate(Study="NY") %>%
mutate(Value=case_when(
Indicator=="PHOSPHORUS_mg_L"~as.numeric(Value)*1000,
Indicator=TRUE~as.numeric(Value)
)) %>%
mutate(Indicator=case_when(
Indicator=="PHOSPHORUS_mg_L"~"PHOSPHORUS_ug_L",
Indicator=TRUE~Indicator
))
str(analysis)
## List of 3
## $ CDF :'data.frame': 659 obs. of 15 variables:
## ..$ Type : chr [1:659] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation : chr [1:659] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:659] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
## ..$ Value : num [1:659] 1.00e-10 4.45e-01 5.50e-01 6.84e-01 1.05 ...
## ..$ nResp : num [1:659] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ Estimate.P : num [1:659] 1.38 2.77 3.45 4.13 4.81 ...
## ..$ StdError.P : num [1:659] 1.22 1.7 1.82 1.95 2 ...
## ..$ MarginofError.P: num [1:659] 2.39 3.32 3.56 3.81 3.93 ...
## ..$ LCB95Pct.P : num [1:659] 0 0 0 0.318 0.886 ...
## ..$ UCB95Pct.P : num [1:659] 3.77 6.09 7.01 7.95 8.74 ...
## ..$ Estimate.U : num [1:659] 30.8 61.6 76.8 91.9 107.1 ...
## ..$ StdError.U : num [1:659] 27 37.4 39.6 41.9 42.6 ...
## ..$ MarginofError.U: num [1:659] 52.9 73.4 77.5 82.1 83.5 ...
## ..$ LCB95Pct.U : num [1:659] 0 0 0 9.76 23.54 ...
## ..$ UCB95Pct.U : num [1:659] 83.7 135 154.3 174 190.6 ...
## $ Pct :'data.frame': 147 obs. of 10 variables:
## ..$ Type : chr [1:147] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation: chr [1:147] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:147] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
## ..$ Statistic : chr [1:147] "5Pct" "10Pct" "25Pct" "50Pct" ...
## ..$ nResp : num [1:147] 5 9 15 26 33 45 48 4 4 10 ...
## ..$ Estimate : num [1:147] 1.08 1.08 1.08 1.08 1.08 ...
## ..$ StdError : num [1:147] 0.411 0.357 0.433 0.586 1.209 ...
## ..$ MarginofError: num [1:147] 0.805 0.7 0.848 1.148 2.369 ...
## ..$ LCB95Pct : num [1:147] 0 0.513 1.564 2.542 4.584 ...
## ..$ UCB95Pct : num [1:147] 1.65 1.94 3.3 4.89 9.43 ...
## $ Mean:'data.frame': 21 obs. of 9 variables:
## ..$ Type : chr [1:21] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation: chr [1:21] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:21] "CHLOROPHYLL_ug_L" "NITROGEN_mg_L" "PHOSPHORUS_mg_L" "CHLORIDE_mg_L" ...
## ..$ nResp : int [1:21] 55 47 60 55 33 41 33 59 35 38 ...
## ..$ Estimate : num [1:21] 5.0524 0.5559 0.0186 44.8451 4.8785 ...
## ..$ StdError : num [1:21] 0.45911 0.08874 0.00235 18.67918 0.74625 ...
## ..$ MarginofError: num [1:21] 0.8998 0.1739 0.0046 36.6105 1.4626 ...
## ..$ LCB95Pct : num [1:21] 4.153 0.382 0.014 8.235 3.416 ...
## ..$ UCB95Pct : num [1:21] 5.9522 0.7298 0.0232 81.4556 6.3411 ...
Here I have plotted the analyses from above. The categorical variables are plotted as dots, maps and bars; the continuous variables are plotted as a distribution function. They are also grouped by QAPP parameter categories in the “Grouped plots” tab.
plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
geom_point()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
facet_wrap(.~Indicator,scales = "free")+
ylim(0,100)+
labs(title="Condition estimates across NYS Ponded Waters",y="Percent of Total",x="Condition category")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
#To plot a cumulative distribution function:
plot <- ggplot(analysis$CDF,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
geom_line()+
geom_point()+
geom_ribbon(alpha=0.5)+
ylim(0,100)+
facet_wrap(.~Indicator, scales="free")+
guides(fill=FALSE,shape=FALSE)+
labs(y="Percent of lakes")
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
#fetch map
states <- map_data("state")
ny <- subset(states, region %in% c("new york"))
att$CHLA_threshold <- factor(att$CHLA_threshold, levels=c("Good","Fair","Poor"))
ggplot() +
geom_polygon(data=ny,aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) +
geom_point(data=att,aes(x=xcoord,y=ycoord,color=CHLA_threshold))+
guides(fill=FALSE)+
theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
#list variables you are interested in and defined above
trophic <- c("phos_trophic","chla_trophic","N_LIMIT","secchi","leech","color")
minerals <- c("zebra","alkalinity")
in.situ <- c("ph","conductance")
habs <- c("microcystin","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
fishing <- c("AMMONIA_excursions","DO_excursions","NITRITE_excursions","PH_excursions","Fully_support")
vars.list <- list(Trophic=trophic,Minerals=minerals,`In Situ`=in.situ,HABs=habs,`Fishing use`=fishing) #name groups for labeling in plots
n<-0
for(i in vars.list){ #loop to plot each group as 1 facet wrapped image
n<-n+1
CatExtent <- cat_analysis(
dframe=att,
vars=i,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
table <- CatExtent %>%
select(Indicator,
Category,
Estimate.P,
LCB95Pct.P,
UCB95Pct.P) %>%
filter(Category!="Total") %>%
mutate(Category=case_when(
Category==0 ~ "Non-excursion", #for fishing use
Category==1 ~ "Excursion",
Category=TRUE~Category
)) %>%
mutate(Category=factor(Category, levels=c("Poor","Fair","Good", #factor for plots
"Blue","Green","Brown","Murky",
"High","Weak","Uncolored",
"Medium","Low",
"Acidic","~Neutral","Slightly alk","Highly alk",
"Soft","Moderately hard","Average","Hard","Very hard",
"Highly susceptible","May be susceptible","Not susceptible",
"Eutrophic","Mesotrophic","Oligotrophic",
"N-limited","Co-limited","P-limited",
"Non-detect","Microcystin detected","Most disturbed",
"Excursion","Non-excursion")),
Indicator=case_when( #rename for plots
Indicator=="phos_trophic"~"Phosphorus",
Indicator=="chla_trophic"~"Chlorophyll-a",
Indicator=="TP_threshold"~"Total phosphorus",
Indicator=="TN_threshold"~"Total nitrogen",
Indicator=="CHLA_threshold"~"Total chlorophyll",
Indicator=="microcystin"~"Microcystin",
Indicator=="d.oxygen"~"Dissolved oxygen",
Indicator=="N_LIMIT"~"Nutrient limitation",
Indicator=="secchi"~"Secchi",
Indicator=="zebra"~"Zebra mussel susceptibility",
Indicator=="conductance"~"Conductance",
Indicator=="color"~"Color",
Indicator=="ph"~"pH",
Indicator=="leech"~"Nutrient-color status",
Indicator=='alkalinity'~"Alkalinity",
Indicator=="chlorophyte"~"% Chlorophyte",
Indicator=="cryptophyta"~"% Cryptophyta",
Indicator=="cyanobacteria"~"% Cyanobacteria",
Indicator=="dinophyta"~"% Dinophyta",
Indicator=="AMMONIA_excursions"~"Ammonia",
Indicator=="DO_excursions"~"Dissolved oxygen",
Indicator=="NITRITE_excursions"~"Nitrite",
Indicator=="PH_excursions"~"pH",
Indicator=="Fully_support"~"All parameters",
Indicator=TRUE~Indicator)
)
if(names(vars.list)[n]=="Fishing use"){ #plot fishing use differently
plot1<-ggplot(table[table$Indicator=="All parameters",],aes(x=Category,y=Estimate.P,fill=Category)) + #all parameter normal bar plot
geom_col()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
ylim(0,100)+
labs(y="Percent of Total",x="Fully supporting",title="Fishing Use Parameters")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))+
scale_fill_grey()
plot2<-ggplot(table[table$Indicator!="All parameters",],aes(x=Indicator,y=Estimate.P,fill=Category)) + #stacked plot for individual parameters
geom_bar(position="stack", stat="identity")+
ylim(0,100)+
labs(y="",x="Parameter",title=" ")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))+
scale_fill_grey()
plot1 <- set_panel_size(p = plot1, width = unit(4, "cm"), height = unit(4,"cm"))
plot2 <- set_panel_size(p = plot2, width = unit(6, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot1,plot2,ncol=2)
}else{ #all other groups are plotted here
plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
geom_col()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
facet_wrap(.~Indicator,scales = "free")+
ylim(0,100)+
labs(y="Percent of Total",x="Condition category",title=paste(names(vars.list)[n],"Parameters"))+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
}
trophic <- c("CHLOROPHYLL_ug_L",
"NITROGEN_mg_L",
"PHOSPHORUS_mg_L",
"SECCHI_m",
"TRUE_COLOR",
"NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
"PH_hypo",
"SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL")
habs <- c("CYANOBACTERIA")
vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals,HABs=habs) #named for plotting
n<-0
for(i in vars.list){ #loop to plot each group as 1 facet wrapped image
n<-n+1
#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
dframe = att,
vars = i,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
table<-analysis$CDF %>%
select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>%
filter(Indicator %in% i) %>%
mutate(Indicator=case_when( #rename for plots
Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
Indicator=="PHOSPHORUS_mg_L"~"Total phosphorus (mg/L)",
Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
Indicator=="SECCHI_m"~"Secchi disk depth (m)",
Indicator=="TRUE_COLOR"~"True color (PCU)",
Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
Indicator=="PH_hypo"~"pH (SU)",
Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
Indicator=="NP_ratio"~"DIN:TP ratio",
Indicator=="CYANOBACTERIA"~"% Cyanobacteria",
Indicator=TRUE~Indicator
)) %>%
mutate(Lower=case_when( #defining lower thresholds for plots
Indicator=="Chlorophyll-a (ug/L)"~2,
Indicator=="Total phosphorus (mg/L)"~0.01,
Indicator=="Specific conductance (mS/cm)"~125,
Indicator=="Alkalinity (mg/L)"~60,
Indicator=="Calcium (mg/L)"~10,
Indicator=="True color (PCU)"~10,
Indicator=="Secchi disk depth (m)"~2,
Indicator=TRUE~as.numeric(""))) %>%
mutate(Upper=case_when( #defining upper thresholds for plots
Indicator=="Chlorophyll-a (ug/L)"~8,
Indicator=="Total phosphorus (mg/L)"~0.02,
Indicator=="Specific conductance (mS/cm)"~250,
Indicator=="Alkalinity (mg/L)"~120,
Indicator=="Calcium (mg/L)"~20,
Indicator=="True color (PCU)"~20,
Indicator=="Secchi disk depth (m)"~5,
Indicator=TRUE~as.numeric("")))
plot<-ggplot(table,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
geom_line()+
geom_point()+
geom_ribbon(alpha=0.5)+
ylim(0,100)+
guides(fill="none",shape="none")+
facet_wrap(.~Indicator, scales="free")+
geom_vline(aes(xintercept=Lower),linetype="dashed")+
geom_vline(aes(xintercept=Upper),linetype="dashed")+
labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))+
scale_color_grey()
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
Here I make the comparisons to NLA and NH data from above, as well as to historical LMAS sampling. Thresholds for NLA, NH and NY comparisons are from the NLA and are “good”, “fair”, and “poor” for chlorophyll-a, phosphorus, nitrogen and epilimnetic dissolved oxygen. There are no thresholds for chloride.
The values for LMAS sampling are derived from a separate script that counts the number of lakes in each size class from samples in the last 10 years.
## During execution of the program, 3 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
## During execution of the program, 10 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
plot <- ggplot(cats,aes(x=Category,y=Estimate.P,color=Study,shape=Study))+
geom_point(position=position_dodge(width=0.2))+
ylim(0,100)+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=.25, position=position_dodge(width=0.2))+
facet_wrap(.~Indicator, scales="free")+
labs(title="Condition category extent estimates")
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
"NITROGEN_mg_L",
"PHOSPHORUS_ug_L",
"SECCHI_m",
"TRUE_COLOR",
"NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
"PH_hypo",
"SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL")
vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals)
n<-0
for(i in vars.list){ #loop to plot each group as 1 facet wrapped image
n<-n+1
table<-all %>%
filter(Indicator %in% i) %>%
mutate(Indicator=case_when(
Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
Indicator=="PHOSPHORUS_ug_L"~"Total phosphorus (ug/L)",
Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
Indicator=="SECCHI_m"~"Secchi disk depth (m)",
Indicator=="TRUE_COLOR"~"True color (PCU)",
Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
Indicator=="PH_hypo"~"pH (SU)",
Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
Indicator=="DOC_mg_L"~"Dissolved organic carbon (mg/L)",
Indicator=="NP_ratio"~"DIN:TP ratio",
Indicator=TRUE~Indicator
)) %>%
filter((Indicator=="Chlorophyll-a (ug/L)"&Value<50)| #cutting off extreme values for certain variables
(Indicator=="Chloride (mg/L)"&Value<250)|
(Indicator=="Total nitrogen (mg/L)"&Value<2.5)|
(Indicator=="Total phosphorus (ug/L)"&Value<150)|
(Indicator=="Dissolved organic carbon (mg/L)"&Value<50)|
(Indicator=="Calcium (mg/L)"&Value<50)|
(Indicator=="Conductance (mS/cm)" & Value<1000)|
(Indicator=="Magnesium, open water (ug/L)"&Value<25000)|
(Indicator=="DIN:TP ratio"&Value<50)|
!(Indicator %in% c("Chlorophyll-a (ug/L)","Chloride (mg/L)","Total nitrogen (mg/L)", "Total phosphorus (ug/L)","Dissolved organic carbon (mg/L)","Calcium (mg/L)","Conductance (mS/cm)","Magnesium, open water (ug/L)","DIN:TP ratio"))) %>%
mutate(Study=factor(Study, levels=c("NY","NLA","NAP","NH")))
plot <- ggplot(table,aes(x=Value,y=Estimate.P,color=Study,shape=Study,linetype=Study))+
geom_line()+
ylim(0,100)+
scale_linetype_manual(values=c("solid","dotdash", "dotted","dashed"))+
facet_wrap(.~Indicator, scales="free")+
labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
The plot below shows the distribution of the size classes and trophic classes included in the data frame. For comparison, I’ve plotted the proportion of NYS lakes sampled in each size class since 2010.
#Plotting data frame distribution as well as NYS sample distribution collected since 2010
#LMAS numbers retrieved in 2020, script to do so is in Probability.Results.2021.rmd
frame<-att %>%
select(PROB_CAT,AREA_CAT) %>%
rename(size_category=AREA_CAT) %>%
distinct() %>%
mutate(Probability_Sites = case_when( #statewide estimate from sampling design frame
endsWith(size_category, "(1,4]") ~ ((1828/6437)*100),
endsWith(size_category, "(4,10]") ~ ((2490/6437)*100),
endsWith(size_category, "(10,20]") ~ ((1003/6437)*100),
endsWith(size_category, "(20,50]") ~ ((616/6437)*100),
endsWith(size_category, ">50") ~ ((500/6437)*100))) %>%
mutate(LMAS_Sites = case_when(
endsWith(size_category, "(1,4]") ~ (7.64),
endsWith(size_category, "(4,10]") ~ (16.7),
endsWith(size_category, "(10,20]") ~ (16.2),
endsWith(size_category, "(20,50]") ~ (19.3),
endsWith(size_category, ">50") ~ (40.1))) %>%
rename("Statewide"=Probability_Sites,
"LMAS"=LMAS_Sites) %>%
ungroup() %>%
select(-PROB_CAT) %>%
gather(study,percentage,-size_category) %>%
arrange(study,size_category) %>%
mutate(percentage=as.numeric(percentage),
size_category=factor(size_category,levels=c("(1,4]","(4,10]","(10,20]","(20,50]",">50")))
plot <- ggplot(frame, aes(x=size_category,y=percentage,fill=study)) +
geom_bar(stat = "identity", position = position_dodge())+
labs(title="Size classes",y="Percent of Total",x="Size category (hectares)")+
scale_fill_grey()+
theme(legend.position = "none")
plot1 <- set_panel_size(p = plot, width = unit(6, "cm"), height = unit(4,"cm")) #save plot for later
forplot<-probability.cat %>%
mutate(study="Statewide") %>%
add_row(Indicator="Chlorophyll-a",
Category="Eutrophic",
study="LMAS",
Estimate.P=((215/473)*100)) %>%
add_row(Indicator="Chlorophyll-a",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((182/473)*100)) %>%
add_row(Indicator="Chlorophyll-a",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((76/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Eutrophic",
study="LMAS",
Estimate.P=((211/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((141/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((121/473)*100)) %>%
add_row(Indicator="Secchi",
Category="Eutrophic",
study="LMAS",
Estimate.P=((371/700)*100)) %>%
add_row(Indicator="Secchi",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((279/700)*100)) %>%
add_row(Indicator="Secchi",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((50/700)*100)) %>%
mutate(LCB95Pct.P=ifelse(is.na(LCB95Pct.P),Estimate.P,LCB95Pct.P),
UCB95Pct.P=ifelse(is.na(UCB95Pct.P),Estimate.P,UCB95Pct.P))
plot <- ggplot(forplot, aes(x=Category,fill=study)) +
geom_bar_pattern(stat = "identity", position = position_dodge(),
aes(y=Estimate.P,
pattern_type = study,
pattern_fill = study),
pattern= 'magick',
fill= 'white',
colour= 'black',
pattern_scale = 0.5,
pattern_key_scale_factor = 1.1)+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
position=position_dodge(.9))+
facet_wrap(~Indicator,scales = "free")+
ylim(0,100)+
labs(title="Trophic Classes across NYS versus LMAS samples since 2010",y="Percent of Total",x="Trophic Classes")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
legend.key.height = unit(1, 'cm'),
legend.key.width = unit(1, 'cm'))+
scale_pattern_type_discrete(choices = c("gray15","bricks"))
# plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
#
# gridExtra::grid.arrange(plot)
#plot for just phosphorus trophic bias
plot2 <- ggplot(forplot[forplot$Indicator=="Phosphorus",], aes(x=Category,fill=study,y=Estimate.P)) +
geom_bar(stat = "identity", position = position_dodge())+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
position=position_dodge(.9))+
labs(title="Phosphorus",x="Trophic class",y="")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
legend.key.height = unit(1, 'cm'),
legend.key.width = unit(1, 'cm'))+
scale_fill_grey()
plot2 <- set_panel_size(p = plot2, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot1, plot2,ncol=2) #plot both side by side
sessionInfo(package=NULL)
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] spsurvey_5.0.1 survey_4.1-1 survival_3.2-13 Matrix_1.3-4
## [5] sf_1.0-3 maps_3.4.0 ggmap_3.0.0 ggpattern_0.4.2
## [9] egg_0.4.5 gridExtra_2.3 lubridate_1.8.0 huxtable_5.4.0
## [13] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [17] readr_2.0.2 tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5
## [21] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-153 bitops_1.0-7 fs_1.5.0
## [4] httr_1.4.2 tools_4.1.2 backports_1.3.0
## [7] bslib_0.3.1 utf8_1.2.2 R6_2.5.1
## [10] AlgDesign_1.2.0 KernSmooth_2.23-20 DBI_1.1.1
## [13] colorspace_2.0-2 withr_2.4.2 sp_1.4-5
## [16] tidyselect_1.1.1 compiler_4.1.2 cli_3.1.0
## [19] rvest_1.0.2 xml2_1.3.2 labeling_0.4.2
## [22] sass_0.4.0 scales_1.1.1 classInt_0.4-3
## [25] proxy_0.4-26 commonmark_1.7 crossdes_1.1-1
## [28] digest_0.6.28 minqa_1.2.4 rmarkdown_2.11
## [31] jpeg_0.1-9 pkgconfig_2.0.3 htmltools_0.5.2
## [34] lme4_1.1-27.1 highr_0.9 dbplyr_2.1.1
## [37] fastmap_1.1.0 rlang_0.4.12 readxl_1.3.1
## [40] rstudioapi_0.13 farver_2.1.0 jquerylib_0.1.4
## [43] generics_0.1.1 jsonlite_1.7.2 gtools_3.9.2
## [46] magrittr_2.0.1 Rcpp_1.0.7 munsell_0.5.0
## [49] fansi_0.5.0 lifecycle_1.0.1 stringi_1.7.5
## [52] yaml_2.2.1 MASS_7.3-54 plyr_1.8.6
## [55] crayon_1.4.2 deldir_1.0-6 lattice_0.20-45
## [58] haven_2.4.3 splines_4.1.2 hms_1.1.1
## [61] knitr_1.36 pillar_1.6.4 boot_1.3-28
## [64] rjson_0.2.20 reprex_2.0.1 glue_1.5.0
## [67] evaluate_0.14 mitools_2.4 modelr_0.1.8
## [70] nloptr_1.2.2.3 png_0.1-7 vctrs_0.3.8
## [73] tzdb_0.2.0 RgoogleMaps_1.4.5.3 cellranger_1.1.0
## [76] gtable_0.3.0 assertthat_0.2.1 xfun_0.28
## [79] broom_0.7.10 e1071_1.7-9 class_7.3-19
## [82] units_0.7-2 ellipsis_0.3.2
end.time <- Sys.time()
elapsed.time <- round((end.time - start.time), 3)
print(elapsed.time)
## Time difference of 15.361 mins